In [1]:
DATA_FOLDER = 'Data' # Use the data folder provided in Tutorial 02 - Intro to Pandas.
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from dateutil.parser import parse
from os import listdir
from os.path import isfile, join
sns.set_context('notebook')
The DATA_FOLDER/ebola folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

Use pandas to import these data files into a single DataFrame.

Using this DataFrame, calculate for each country the daily average per month of new cases and deaths.

Make sure you handle all the different expressions for new cases and deaths that are used in the reports.
First, we define some helper functions that we will use when parsing the data.
In [2]:
def get_files(country):
    path = DATA_FOLDER + "/ebola/" + country + "_data/"
    return [f for f in listdir(path) if isfile(join(path, f))]
In [3]:
def sum_row(row, total_col):
    # Extract the single value in the totals column of a one-row selection.
    return float(row[total_col].values[0])

def sum_rows(rows, total_col):
    # Add up the totals column over several one-row selections.
    tot = 0
    for row in rows:
        tot += sum_row(row, total_col)
    return tot
Now, for each country we define a function which, for a given file, returns a dictionary with the country, the date, upper and lower bounds for the new cases, and upper and lower bounds for the new deaths. As we don't know whether the new cases / deaths for the 'probable' and 'suspected' cases are reliable, we decided to create an upper bound as the sum of the 'confirmed', 'probable' and 'suspected' new cases / deaths, and a lower bound with only the 'confirmed' new cases / deaths.
The structure of these functions is the same for each country; only the labels used to describe the data change.
In [4]:
def get_row_guinea(file):
    country = 'guinea'
    date = file[:10]  # the report date is the first 10 characters of the file name
    raw = pd.read_csv(DATA_FOLDER + "/ebola/" + country + "_data/" + file)
    total_col = "Totals"
    new_cases_lower = sum_row(raw[raw.Description == "New cases of confirmed"], total_col)
    new_cases_upper = sum_row(raw[raw.Description == "Total new cases registered so far"], total_col)
    new_deaths_lower = sum_row(raw[(raw.Description == "New deaths registered today (confirmed)") | (raw.Description == "New deaths registered")], total_col)
    new_deaths_upper = sum_row(raw[(raw.Description == "New deaths registered today") | (raw.Description == "New deaths registered")], total_col)
    return {'Country' : country, 'Date' : parse(date), 'NewCasesLower' : new_cases_lower, 'NewCasesUpper' : new_cases_upper, 'NewDeathsLower' : new_deaths_lower, 'NewDeathsUpper' : new_deaths_upper}
In [5]:
def get_row_liberia(file):
    country = 'liberia'
    date = file[:10]
    raw = pd.read_csv(DATA_FOLDER + "/ebola/" + country + "_data/" + file).fillna(0)
    total_col = "National"
    new_cases_lower = sum_row(raw[raw.Variable == "New case/s (confirmed)"], total_col)
    list_cases_upper = ["New Case/s (Suspected)",
                        "New Case/s (Probable)",
                        "New case/s (confirmed)"]
    new_cases_upper = sum_rows([raw[raw.Variable == row] for row in list_cases_upper], total_col)
    new_deaths_lower = sum_row(raw[raw.Variable == "Newly reported deaths"], total_col)
    new_deaths_upper = new_deaths_lower  # no confirmed/suspected breakdown of deaths, so both bounds coincide
    return {'Country' : country, 'Date' : parse(date), 'NewCasesLower' : new_cases_lower, 'NewCasesUpper' : new_cases_upper, 'NewDeathsLower' : new_deaths_lower, 'NewDeathsUpper' : new_deaths_upper}
As the files for Sierra Leone do not contain data for the new deaths, we first extract the total deaths for each day, and process them later to derive the new deaths.
In [6]:
def get_row_sl(file):
    country = 'sl'
    date = file[:10]
    raw = pd.read_csv(DATA_FOLDER + "/ebola/" + country + "_data/" + file).fillna(0)
    total_col = "National"
    new_cases_lower = sum_row(raw[raw.variable == "new_confirmed"], total_col)
    list_cases_upper = ["new_suspected",
                        "new_probable",
                        "new_confirmed"]
    new_cases_upper = sum_rows([raw[raw.variable == row] for row in list_cases_upper], total_col)
    list_death_upper = ["death_suspected",
                        "death_probable",
                        "death_confirmed"]
    total_death_upper = sum_rows([raw[raw.variable == row] for row in list_death_upper], total_col)
    total_death_lower = sum_row(raw[raw.variable == "death_confirmed"], total_col)
    return {'Country' : country, 'Date' : parse(date), 'NewCasesLower' : new_cases_lower, 'NewCasesUpper' : new_cases_upper, 'TotalDeathLower' : total_death_lower, 'TotalDeathUpper' : total_death_upper}
In [7]:
rows_guinea = [get_row_guinea(file) for file in get_files("guinea")]
In [8]:
rows_liberia = [get_row_liberia(file) for file in get_files("liberia")]
We now transform the data for Sierra Leone:
In [9]:
rows_sl_total_deaths = [get_row_sl(file) for file in get_files("sl")]
dic_sl_total_deaths = {}
for row in rows_sl_total_deaths:
    dic_sl_total_deaths[row['Date']] = row
rows_sl = []
for date, entry in dic_sl_total_deaths.items():
    date_before = date - datetime.timedelta(days=1)
    if date_before in dic_sl_total_deaths:
        # Only difference days whose totals are both reported (non-zero).
        if entry['TotalDeathUpper'] != 0 and dic_sl_total_deaths[date_before]['TotalDeathUpper'] != 0 and entry['TotalDeathLower'] != 0 and dic_sl_total_deaths[date_before]['TotalDeathLower'] != 0:
            copy = dict(entry)
            del copy['TotalDeathUpper']
            del copy['TotalDeathLower']
            copy['NewDeathsUpper'] = entry['TotalDeathUpper'] - dic_sl_total_deaths[date_before]['TotalDeathUpper']
            copy['NewDeathsLower'] = entry['TotalDeathLower'] - dic_sl_total_deaths[date_before]['TotalDeathLower']
            rows_sl.append(copy)
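For reference, the same day-to-day differencing can be written more compactly with pandas. This is a minimal sketch that reuses the rows_sl_total_deaths list built above and, for brevity, omits the non-zero-total guard used in the loop:

sl = pd.DataFrame(rows_sl_total_deaths).set_index('Date').sort_index()
# diff() subtracts the previous row, so keep only differences between consecutive calendar days.
consecutive = sl.index.to_series().diff() == datetime.timedelta(days=1)
new_deaths = sl[['TotalDeathLower', 'TotalDeathUpper']].diff()[consecutive]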
We can now insert the data into a dataframe. For Liberia, December's data is in a completely different format, so we dropped it: for instance, on some days the reported new cases are the new cases for that day, while on others they are the total cases for the country.
In [10]:
raw_dataframe = pd.DataFrame(columns=['Country', 'Date', 'NewCasesLower', 'NewCasesUpper', 'NewDeathsLower', 'NewDeathsUpper'])
for rows in rows_sl, rows_guinea:
    # append() accepts a list of dicts, adding one row per dict
    raw_dataframe = raw_dataframe.append(rows, ignore_index = True)
for row in rows_liberia:
    if row['Date'].month != 12:  # December data is erroneous
        raw_dataframe = raw_dataframe.append(row, ignore_index = True)
raw_dataframe
Out[10]:
In [11]:
dataframe = raw_dataframe.set_index(['Country', 'Date'])
In [12]:
dataframe_no_day = raw_dataframe  # note: this aliases raw_dataframe rather than copying it
dataframe_no_day['Year'] = raw_dataframe['Date'].apply(lambda x: x.year)
dataframe_no_day['Month'] = raw_dataframe['Date'].apply(lambda x: x.month)
final_df = dataframe_no_day[['Country', 'Year', 'Month', 'NewCasesLower', 'NewCasesUpper', 'NewDeathsLower', 'NewDeathsUpper']].groupby(['Country', 'Year', 'Month']).mean()
final_df
Out[12]:
Finally, to get a general picture of the data, we average the lower and upper bounds.
In [13]:
s1 = final_df[['NewCasesLower', 'NewCasesUpper']].mean(axis=1)
s2 = final_df[['NewDeathsLower', 'NewDeathsUpper']].mean(axis=1)
final = pd.concat([s1, s2], axis=1)
final.columns = ['NewCasesAverage', 'NewDeathsAverage']
final
Out[13]:
In the DATA_FOLDER/microbiome subdirectory, there are 9 spreadsheets of microbiome data that was acquired from high-throughput RNA sequencing procedures, along with a 10th file that describes the content of each.

Use pandas to import the first 9 spreadsheets into a single DataFrame.

Then, add the metadata information from the 10th spreadsheet as columns in the combined DataFrame.

Make sure that the final DataFrame has a unique index and all the NaN values have been replaced by the tag unknown.
We load the first spreadsheet from the file's Sheet 1. Then we add a new column, identical for every row of this import, holding the barcode of the sample (taken from the file name). Then we rename the columns for clarity.
In [14]:
mid = pd.read_excel(DATA_FOLDER + '/microbiome/MID1.xls', sheetname='Sheet 1', header=None)
mid.fillna('unknown', inplace=True)
mid['BARCODE'] = 'MID1'
mid.columns = ['Taxon', 'Count', 'BARCODE']
Now we repeat this operation for every other spreadsheet except the metadata. At each iteration we simply concatenate the new data onto the previous data, thus accumulating all the files' data into a single dataframe. We don't care about the index right now, since we will set a proper one later.
In [15]:
for i in range(2, 10):
    midi = pd.read_excel(DATA_FOLDER + '/microbiome/MID' + str(i) + '.xls', sheetname='Sheet 1', header=None)
    midi.fillna('unknown', inplace=True)
    midi['BARCODE'] = 'MID' + str(i)
    midi.columns = ['Taxon', 'Count', 'BARCODE']
    mid = pd.concat([mid, midi])
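As an aside, concatenating inside the loop copies the accumulated frame at every step. An equivalent and more idiomatic variant collects the pieces in a list first and concatenates once:

frames = []
for i in range(1, 10):
    midi = pd.read_excel(DATA_FOLDER + '/microbiome/MID' + str(i) + '.xls', sheetname='Sheet 1', header=None)
    midi.fillna('unknown', inplace=True)
    midi['BARCODE'] = 'MID' + str(i)
    midi.columns = ['Taxon', 'Count', 'BARCODE']
    frames.append(midi)
mid = pd.concat(frames)  # single concatenation of all nine spreadsheets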
Finally, we merge with the metadata, joining on the BARCODE column, which is the index of the metadata as we import it (index_col=0). We then set the index to the columns BARCODE and Taxon, whose combination is unique.

The only NaN value we found was the NA value in the metadata, which may indicate that there is no sample for the first group. We replaced it with unknown anyway.
In [16]:
metadata = pd.read_excel(DATA_FOLDER + '/microbiome/metadata.xls', sheetname='Sheet1', index_col=0)
metadata.fillna('unknown', inplace=True)
merged = pd.merge(mid, metadata, right_index=True, left_on='BARCODE')
merged = merged.set_index(keys=['BARCODE', 'Taxon'])
merged
Out[16]:
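As a sanity check against the requirements, we can verify that the final index is unique and that no NaN values remain:

print(merged.index.is_unique)        # should be True: each (BARCODE, Taxon) pair appears once
print(merged.isnull().values.any())  # should be False: every NaN was replaced by 'unknown'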
In [17]:
from IPython.core.display import HTML
HTML(filename=DATA_FOLDER+'/titanic.html')
Out[17]:
For each of the following questions state clearly your assumptions and discuss your findings.

1. We start by importing the data from the file.
In [18]:
titanic = pd.read_excel(DATA_FOLDER + '/titanic.xls', sheetname='titanic')
titanic
Out[18]:
Next we can list the data types of each field.
In [19]:
titanic.dtypes
Out[19]:
When it comes to the object fields, we can be a bit more precise: name, sex, ticket, cabin, embarked, boat and home.dest are all strings.
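A quick way to list these columns is to select everything pandas stored with the generic object dtype:

titanic.select_dtypes(include=['object']).columns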
Next, we call the describe method to list some statistics on the data. We thus obtain the range of each of the numeric fields.
In [20]:
titanic.describe()
Out[20]:
Moreover, we can also note the ranges of some other fields. For example, sex has only two possible values, female and male, and embarked can only be S, C or Q.
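These ranges can be verified directly, for instance:

titanic.sex.unique(), titanic.embarked.dropna().unique()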
For a better visual result, we decided to replace the travel class, survival and port codes with more readable values. Since we convert them to categorical types, performance stays the same.
In [21]:
class_dic = {1 : 'First Class', 2 : 'Second Class', 3 : 'Third Class', np.nan : np.nan}
survived_dic = {0 : 'Deceased' , 1 : 'Survived', np.nan : np.nan}
embarked_dic = {'C' : 'Cherbourg', 'Q' : 'Queenstown', 'S' : 'Southampton', np.nan : np.nan}
titanic['pclass'] = titanic['pclass'].apply(lambda x: class_dic[x])
titanic['survived'] = titanic['survived'].apply(lambda x: survived_dic[x])
titanic['embarked'] = titanic['embarked'].apply(lambda x: embarked_dic[x])
Then we convert the categorical columns to actual categorical types.
In [22]:
titanic['pclass'] = titanic.pclass.astype('category')
titanic['survived'] = titanic.survived.astype('category')
titanic['sex'] = titanic.sex.astype('category')
titanic['embarked'] = titanic.embarked.astype('category')
titanic['cabin'] = titanic.cabin.astype('category')
titanic['boat'] = titanic.boat.astype('category')
2. We plot the histogram of the travel class.
In [23]:
titanic.pclass.value_counts(sort=False).plot(kind='bar')
Out[23]:
Next we plot the histogram of the three embarkation ports.
In [24]:
titanic.embarked.value_counts().plot(kind='bar')
Out[24]:
Next we plot the histogram of the sexes.
In [25]:
titanic.sex.value_counts().plot(kind='bar')
Out[25]:
Next, we cut the age data into decades and plot the histogram of the decades.
In [26]:
pd.cut(titanic.age, range(0,90,10)).value_counts(sort=False).plot(kind='bar')
Out[26]:
3. We plot the cabin floor data as a pie chart.
In [27]:
titanic.cabin.dropna().apply(lambda x : x[0]).value_counts(sort=False).plot(kind='pie')
Out[27]:
4. Here, we plot the proportion of people that survived in the first class.
In [28]:
titanic[titanic.pclass == "First Class"].survived.value_counts(sort=False).plot(kind='pie')
Out[28]:
Next, we plot the proportion of people that survived in the second class.
In [29]:
titanic[titanic.pclass == "Second Class"].survived.value_counts(sort=False).plot(kind='pie')
Out[29]:
Finally, we plot the proportion of people that survived in the third class.
In [30]:
titanic[titanic.pclass == "Third Class"].survived.value_counts(sort=False).plot(kind='pie')
Out[30]:
As we can see, the lower the class, the higher the probability of death.
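A quick numerical check of this trend, computed from the readable labels set above:

# Fraction of survivors per travel class.
(titanic.survived == 'Survived').groupby(titanic.pclass).mean()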
5. Here we add new columns that will help us calculate the proportion of survivors in the last part.
In [31]:
titanic.insert(0, 'alive', 0)
titanic.insert(0, 'dead', 0)
titanic.insert(0, 'ratio', 0)
Here we set these new columns to appropriate values. We essentially split the survived column into two indicator columns for easier summing later on. Finally, we slice the data to keep only the columns of interest.
In [32]:
titanic.loc[titanic['survived'] == "Survived", 'alive'] = 1
titanic.loc[titanic['survived'] == "Deceased", 'dead'] = 1
df = titanic[['pclass', 'sex', 'alive', 'dead', 'ratio']]
We group the data by the sex and class of the passengers and sum it. We then have the counts of surviving and deceased people grouped as we wish, and we can easily calculate the proportion that survived, which we plot as a histogram.
In [33]:
aggregated = df.groupby(['sex', 'pclass']).sum()
(aggregated['alive'] / (aggregated['alive'] + aggregated['dead'])).plot(kind='bar')
Out[33]:
We can see that there is a huge difference in survival between the classes and sexes: for instance, third-class males had roughly seven times less chance of survival than first-class females.
6. Next we insert a new column holding the age category of each person. Since we want to split the people into two equal groups based on age, we compute the median age of the passengers. We also drop the passengers with an unknown age, to avoid distorting the median computation.
In [34]:
titanic.dropna(axis=0, subset=['age'], inplace=True)
titanic.insert(0, 'age_category', 0)
median = titanic['age'].median()
Next, we assign the correct category to people below or above the median age; people at exactly the median are grouped with those below it. We then make this column categorical.
In [35]:
titanic.loc[titanic['age'] > median, 'age_category'] = "Age > " + str(median)
titanic.loc[titanic['age'] <= median, 'age_category'] = "Age <= " + str(median)
titanic['age_category'] = titanic.age_category.astype('category')
Next, we take the columns of interest and group by age category, sex and travel class. We then sum over these groups, obtaining the counts of those who lived and those who died, from which we compute the proportion of survivors and display it as a dataframe.
In [36]:
sub = titanic[['pclass', 'sex', 'age_category', 'alive', 'dead', 'ratio']]
subagg = sub.groupby(['age_category', 'sex', 'pclass']).sum()
subagg['ratio'] = (subagg['alive'] / (subagg['alive'] + subagg['dead']))
only_ratio = subagg[['ratio']]
only_ratio
Out[36]:
As before, we can see that there is a huge difference in survival between the classes and sexes. On the other hand, age doesn't make a large difference: whether a passenger is above or below 28 years old, their probability of survival is determined far more by sex and class.
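For an at-a-glance comparison, the same ratios can also be plotted, for instance:

# One bar group per age category, split by (sex, class) combination.
only_ratio['ratio'].unstack(['sex', 'pclass']).plot(kind='bar')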